{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# [ Chapter 4 - Crowdsourced Relevance ] \n",
    "# Setting up the Retrotech Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "from aips import display_product_search, get_engine\n",
    "from aips.spark.dataframe import from_csv\n",
    "\n",
    "engine = get_engine()\n",
    "spark = SparkSession.builder.appName(\"AIPS\").getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the Retrotech (Ecommerce) Products + Signals Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'retrotech'...\n",
      "remote: Enumerating objects: 19, done.\u001b[K\n",
      "remote: Counting objects: 100% (19/19), done.\u001b[K\n",
      "remote: Compressing objects: 100% (19/19), done.\u001b[K\n",
      "remote: Total 19 (delta 0), reused 17 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (19/19), 48.28 MiB | 25.26 MiB/s, done.\n",
      "Already up to date.\n",
      "products.csv\n",
      "signals.csv\n"
     ]
    }
   ],
   "source": [
    "#Get datasets\n",
    "![ ! -d 'retrotech' ] && git clone --depth 1 https://github.com/ai-powered-search/retrotech.git\n",
    "! cd retrotech && git pull\n",
    "! cd retrotech && mkdir -p '../data/retrotech/' && tar -xvf products.tgz -C '../data/retrotech/' && tar -xvf signals.tgz -C '../data/retrotech/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get a Feel for the Product Catalog"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 4.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"upc\",\"name\",\"manufacturer\",\"short_description\",\"long_description\"\n",
      "\"096009010836\",\"Fists of Bruce Lee - Dolby - DVD\", , , \n",
      "\"043396061965\",\"The Professional - Widescreen Uncut - DVD\", , , \n",
      "\"085391862024\",\"Pokemon the Movie: 2000 - DVD\", , , \n",
      "\"067003016025\",\"Summerbreeze - CD\",\"Nettwerk\", , \n",
      "\"731454813822\",\"Back for the First Time [PA] - CD\",\"Def Jam South\", , \n",
      "\"024543008200\",\"Big Momma's House - Widescreen - DVD\", , , \n",
      "\"031398751823\",\"Kids - DVD\", , , \n",
      "\"037628413929\",\"20 Grandes Exitos - CD\",\"Sony Discos Inc.\", , \n",
      "\"060768972223\",\"Power Of Trinity (Box) - CD\",\"Sanctuary Records\", , \n"
     ]
    }
   ],
   "source": [
    "! cd data/retrotech/ && head products.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Index the Products into the Search Engine"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 4.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"products\" collection\n",
      "Creating \"products\" collection\n",
      "Status: Success\n",
      "Loading Products\n",
      "Schema: \n",
      "root\n",
      " |-- upc: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- manufacturer: string (nullable = true)\n",
      " |-- short_description: string (nullable = true)\n",
      " |-- long_description: string (nullable = true)\n",
      "\n",
      "Successfully written 48194 documents\n"
     ]
    }
   ],
   "source": [
    "from aips.data_loaders.products import load_dataframe\n",
    "\n",
    "products_collection = engine.create_collection(\"products\")\n",
    "products_dataframe = load_dataframe(\"data/retrotech/products.csv\")\n",
    "products_collection.write(products_dataframe)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify Searches Work"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 4.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def product_search_request(query, param_overrides={}):\n",
    "    request = {\"query\": query,\n",
    "               \"query_fields\": [\"name\", \"manufacturer\", \"long_description\"],\n",
    "               \"return_fields\": [\"upc\", \"name\", \"manufacturer\",\n",
    "                                 \"short_description\", \"score\"],\n",
    "               \"limit\": 5,\n",
    "               \"order_by\": [(\"score\", \"desc\"), (\"upc\", \"asc\")]}\n",
    "    return request | param_overrides"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div id=\"demo\">\n",
       "\t<center><input style=\"width:40%\" readonly type=\"text\" name=\"q\" value=\"ipod\">\n",
       "\t<input readonly type=\"submit\" value=\"Search\">\n",
       "\t</center>\n",
       "    <div class=\"results\">\n",
       "    \t\n",
       "\n",
       "    </div>\n",
       "</div>\n",
       "    \t<div style=\"position: relative; width: 100%; height:auto; overflow: auto;\">\n",
       "\t    \t<div style=\"position: relative; float:left; width: 120px; margin-top:5px\">\n",
       "\t    \t\t<img style=\"width:250px; height: auto; max-height:150px\" src=\"../../data/retrotech/images/878816004532.jpg\">\n",
       "\t    \t</div>\n",
       "\t    \t<div style=\"position:relative; float:left; clear:none; width: 80%; height:auto\">\n",
       "\t    \t\t<p style=\"font-size:24px; padding-left: 50px;\"><strong>\t\tName:</strong> Fusion - Apple&#xAE; iPod&#xAE; Dock for Most Fusion 600 Series Stereos\n",
       "\t\t\t\t\t<br><strong>\t\tManufacturer:</strong> Fusion</p>\n",
       "\t    \t\t</p>\n",
       "\t    \t</div>\n",
       "    \t</div>\n",
       "    \t\n",
       "\t\t<div style=\"position:relative; clear:both; content: ' '; display: block; height: 1px; margin-top: 10px; margin-bottom:20px\">\n",
       "\t\t\t<hr style=\"color: gray; width: 95%;\" />\n",
       "\t\t</div>\n",
       "\t\t\n",
       "    \t<div style=\"position: relative; width: 100%; height:auto; overflow: auto;\">\n",
       "\t    \t<div style=\"position: relative; float:left; width: 120px; margin-top:5px\">\n",
       "\t    \t\t<img style=\"width:250px; height: auto; max-height:150px\" src=\"../../data/retrotech/images/694318011294.jpg\">\n",
       "\t    \t</div>\n",
       "\t    \t<div style=\"position:relative; float:left; clear:none; width: 80%; height:auto\">\n",
       "\t    \t\t<p style=\"font-size:24px; padding-left: 50px;\"><strong>\t\tName:</strong> Alesis - Multimix 8-Channel USB 2.0 Mixer\n",
       "\t\t\t\t\t<br><strong>\t\tManufacturer:</strong> Alesis</p>\n",
       "\t    \t\t</p>\n",
       "\t    \t</div>\n",
       "    \t</div>\n",
       "    \t\n",
       "\t\t<div style=\"position:relative; clear:both; content: ' '; display: block; height: 1px; margin-top: 10px; margin-bottom:20px\">\n",
       "\t\t\t<hr style=\"color: gray; width: 95%;\" />\n",
       "\t\t</div>\n",
       "\t\t\n",
       "    \t<div style=\"position: relative; width: 100%; height:auto; overflow: auto;\">\n",
       "\t    \t<div style=\"position: relative; float:left; width: 120px; margin-top:5px\">\n",
       "\t    \t\t<img style=\"width:250px; height: auto; max-height:150px\" src=\"../../data/retrotech/images/885909394845.jpg\">\n",
       "\t    \t</div>\n",
       "\t    \t<div style=\"position:relative; float:left; clear:none; width: 80%; height:auto\">\n",
       "\t    \t\t<p style=\"font-size:24px; padding-left: 50px;\"><strong>\t\tName:</strong> Apple&#xAE; - iPod touch&#xAE; 8GB* MP3 Player (4th Generation - Latest Model) - Black\n",
       "\t\t\t\t\t<br><strong>\t\tManufacturer:</strong> Apple&#xAE;</p>\n",
       "\t    \t\t</p>\n",
       "\t    \t</div>\n",
       "    \t</div>\n",
       "    \t\n",
       "\t\t<div style=\"position:relative; clear:both; content: ' '; display: block; height: 1px; margin-top: 10px; margin-bottom:20px\">\n",
       "\t\t\t<hr style=\"color: gray; width: 95%;\" />\n",
       "\t\t</div>\n",
       "\t\t\n",
       "    \t<div style=\"position: relative; width: 100%; height:auto; overflow: auto;\">\n",
       "\t    \t<div style=\"position: relative; float:left; width: 120px; margin-top:5px\">\n",
       "\t    \t\t<img style=\"width:250px; height: auto; max-height:150px\" src=\"../../data/retrotech/images/885909300549.jpg\">\n",
       "\t    \t</div>\n",
       "\t    \t<div style=\"position:relative; float:left; clear:none; width: 80%; height:auto\">\n",
       "\t    \t\t<p style=\"font-size:24px; padding-left: 50px;\"><strong>\t\tName:</strong> Apple - USB Charge/Sync Cable for Apple&#xAE; iPod&#xAE; shuffle\n",
       "\t\t\t\t\t<br><strong>\t\tManufacturer:</strong> Apple</p>\n",
       "\t    \t\t</p>\n",
       "\t    \t</div>\n",
       "    \t</div>\n",
       "    \t\n",
       "\t\t<div style=\"position:relative; clear:both; content: ' '; display: block; height: 1px; margin-top: 10px; margin-bottom:20px\">\n",
       "\t\t\t<hr style=\"color: gray; width: 95%;\" />\n",
       "\t\t</div>\n",
       "\t\t\n",
       "    \t<div style=\"position: relative; width: 100%; height:auto; overflow: auto;\">\n",
       "\t    \t<div style=\"position: relative; float:left; width: 120px; margin-top:5px\">\n",
       "\t    \t\t<img style=\"width:250px; height: auto; max-height:150px\" src=\"../../data/retrotech/images/27108936499.jpg\">\n",
       "\t    \t</div>\n",
       "\t    \t<div style=\"position:relative; float:left; clear:none; width: 80%; height:auto\">\n",
       "\t    \t\t<p style=\"font-size:24px; padding-left: 50px;\"><strong>\t\tName:</strong> Yamaha - Apple&#xAE; iPod&#xAE; and iPhone&#xAE; Dock for Most Yamaha A/V Receivers\n",
       "\t\t\t\t\t<br><strong>\t\tManufacturer:</strong> Yamaha</p>\n",
       "\t    \t\t</p>\n",
       "\t    \t</div>\n",
       "    \t</div>\n",
       "    \t\n",
       "\t\t<div style=\"position:relative; clear:both; content: ' '; display: block; height: 1px; margin-top: 10px; margin-bottom:20px\">\n",
       "\t\t\t<hr style=\"color: gray; width: 95%;\" />\n",
       "\t\t</div>\n",
       "\t\t"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "query = \"ipod\"\n",
    "request = product_search_request(query)\n",
    "response = products_collection.search(**request)\n",
    "display_product_search(query, response[\"docs\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get a Feel for the Signals Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"query_id\",\"user\",\"type\",\"target\",\"signal_time\"\n",
      "\"u2_0_1\",\"u2\",\"query\",\"nook\",\"2019-07-31 08:49:07.3116\"\n",
      "\"u2_1_2\",\"u2\",\"query\",\"rca\",\"2020-05-04 08:28:21.1848\"\n",
      "\"u3_0_1\",\"u3\",\"query\",\"macbook\",\"2019-12-22 00:07:07.0152\"\n",
      "\"u4_0_1\",\"u4\",\"query\",\"Tv antenna\",\"2019-08-22 23:45:54.1030\"\n",
      "\"u5_0_1\",\"u5\",\"query\",\"AC power cord\",\"2019-10-20 08:27:00.1600\"\n",
      "\"u6_0_1\",\"u6\",\"query\",\"Watch The Throne\",\"2019-09-18 11:59:53.7470\"\n",
      "\"u7_0_1\",\"u7\",\"query\",\"Camcorder\",\"2020-02-25 13:02:29.3089\"\n",
      "\"u9_0_1\",\"u9\",\"query\",\"wireless headphones\",\"2020-04-26 04:26:09.7198\"\n",
      "\"u10_0_1\",\"u10\",\"query\",\"Xbox\",\"2019-09-13 16:26:12.0132\"\n"
     ]
    }
   ],
   "source": [
    "! cd data/retrotech && head signals.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Index the Signals into the Search Engine"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 4.4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"signals\" collection\n",
      "Creating \"signals\" collection\n",
      "Status: Success\n",
      "Loading data/retrotech/signals.csv\n",
      "Schema: \n",
      "root\n",
      " |-- query_id: string (nullable = true)\n",
      " |-- user: string (nullable = true)\n",
      " |-- type: string (nullable = true)\n",
      " |-- target: string (nullable = true)\n",
      " |-- signal_time: timestamp (nullable = true)\n",
      "\n",
      "Successfully written 2172605 documents\n"
     ]
    }
   ],
   "source": [
    "signals_collection = engine.create_collection(\"signals\")\n",
    "signals_collection.write(from_csv(\"data/retrotech/signals.csv\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Success!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You have now indexed the RetroTech product catalog and signals into the search engine, and run a sample query against the product collection. The results don't look very relevant using the out of the box keyword scoring function, of course, but we'll be working to improve that throughout the rest of this book!\n",
    "\n",
    "In the next section, we'll take a look at our first crowd-sourced AI-powered search technique: [Signals Boosting](2.signals-boosting.ipynb). "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
